/* $XFree86: xc/programs/Xserver/hw/xfree86/vga256/drivers/mxic/mxic_accel.c,v 1.1.2.5 1997/06/29 08:43:38 dawes Exp $ */

/*
 *
 * Copyright 1995-1997 The XFree86 Project, Inc.
 *
 */

/*
 * The accel file for the ViRGE driver.  
 * 
 * Created 20/03/97 by Sebastien Marineau
 * Revision: 
 * [0.1] 23/03/97: Added bitblts and filled rects, and GE init code.
 * [0.2] 25/03/97: Added CPU to screen color expansion code, 
 *                 8x8 mono color expanded fills, 8x8 color fills.
 * [0.3] 27/03/97: Fixed a few bugs, added planemask support to bitblts, 
 *                 transparency and planemask to 8x8 mono expands and 
 *                 planemask to CPU-Screen color expands. Remaining
 *                 bugs with 8x8 mono are hopefully fixed. 
 *       28/03/97: Started work on accelerated lines. 
 *       29/03/97: Took out ROP definitions and moved them to mxic_rop.c
 * [0.4] 01/04/97: All basic accelerated features now support a planemask.
 *                 Lines basically work, but may not match cfb yet. 
 * [0.5] 14/04/97: Further optimisations, implemented option "pci_retry" which
 *                 relies on PCI bus to do WaitQueues(). Added support for
 *                 32bpp bitblits and solid rects using GE in 16bpp mode.
 * [0.6] 20/04/97: Fix bug for bitblits of width 49..56. Added new macros
 *                 to write to registers; the most frequently used ViRGE
 *                 drawing registers are now cached in memory and only
 *                 rewritten when they change. 
 * [0.7] 21/04/97: Add support for FilledTrapezoid. Right now, this passes
 *                 xtest but they may not match cfb for larger polygons. 
 *
 * Note: we use a few macros to query the state of the coprocessor. 
 * WaitIdle() waits until the GE is idle.
 * WaitIdleEmpty() waits until the GE is idle and its FIFO is empty.
 * WaitCommandEmpty() waits until the command FIFO is empty. The command FIFO
 *       is what handles direct framebuffer writes. We should call this 
 *       before starting any GE functions to make sure that there are no
 *       framebuffer writes left in the FIFO. 
 */
/*
 * Ported to MX8625x by Stephen H.L.Wang
 *
 * version 0.2.1 - 11/8/1998
 *	1. No change.
 *
 * version 0.2alpha - 12/29/1997
 *     The following code does not support all accelerator functions.
 *     Currently only ScreenToScreenCopy and FillRectSolid supported.
 */

#include <math.h>
#include "vga256.h"
#include "xf86.h"
#include "vga.h"
#include "xf86xaa.h"
#include "xf86_OSlib.h"
#include "mxic_driver.h"
#include "mxic_rop.h"

/* Driver-private state shared with the rest of the MXIC driver. */
extern MXICPRIV mxicPriv;

/* Globals used in driver */
extern pointer mxicMmioMem;
/* Base bits OR-ed into every command word; cleared by MXICAccelInit(). */
int mxicAccelCmd = 0;
/* Dummy CPU transfer target handed to XAA by
 * MXICSetupForCPUToScreenColorExpand() when the ROP has no source. */
static int mxicDummyTransferArea;
/* Command word built by the SetupFor*() routines and issued by the
 * matching Subsequent*() routines. */
static int mxicSavedCmd = 0;
/* NEED_MONO_FILL / NO_MONO_FILL flag chosen by the SetupFor*() routines. */
static int mxicSavedCmd2 = 0;
/* Read by MXICSubsequentFillRectSolid() and MXICSubsequentTwoPointLine().
 * NOTE(review): the only assignment in this file sits in commented-out
 * code, so in the code shown here it always holds its initial 0. */
static int mxicSavedRectCmdForLine = 0;
/* Set by MXICSubsequentTwoPointLine(); the workaround that consumed it in
 * MXICAccelSync() is currently commented out. */
static int mxicSyncForLineBug = 0;
/* Set by MXICSetClippingRectangle(), tested and cleared by
 * MXICSubsequentTwoPointLine(). */
static int mxicLineHWClipSet = 0;

/* These variables hold cached values for some ViRGE-style registers.
 * The important thing to remember is that these registers must always be
 * set using the "caching" version of the macros (CACHE_SETB_*).
 * MXICGEReset() invalidates them; presumably the CACHE_SETB_* macros are
 * what read and update them -- confirm in mxic_driver.h.
 */
static unsigned int mxicCached_CMD_SET;
static unsigned int mxicCached_CLIP_TL;
static unsigned int mxicCached_CLIP_BR;
static unsigned int mxicCached_MONO_PATTERN0;
static unsigned int mxicCached_MONO_PATTERN1;
static unsigned int mxicCached_PAT_FGCLR;
static unsigned int mxicCached_PAT_BGCLR;
static unsigned int mxicCached_RWIDTH_HEIGHT;

/* Temporary counters to see if caching works; printed and zeroed by
 * MXICGEReset(). */
static int mxicCacheHit = 0, mxicCacheMiss = 0;

/* Forward declarations of functions used in the driver (K&R style, no
 * parameter lists). MXICROPHasSrc()/MXICROPHasDst() are not defined in
 * this file. */
void MXICAccelSync();
void MXICAccelInit();
void MXICAccelInit24();
void MXICSetupForScreenToScreenCopy();
void MXICSubsequentScreenToScreenCopy();
void MXICSetupForScreenToScreenCopy24();
void MXICSubsequentScreenToScreenCopy24();
void MXICSetupForFillRectSolid();
void MXICSubsequentFillRectSolid();
void MXICSetupForFillRectSolid24();
void MXICSubsequentFillRectSolid24();
void MXICSetupForCPUToScreenColorExpand();
void MXICSubsequentCPUToScreenColorExpand();
void MXICSetupFor8x8PatternColorExpand();
void MXICSubsequent8x8PatternColorExpand();
void MXICSetupForFill8x8Pattern();
void MXICSubsequentFill8x8Pattern();
void MXICSubsequentTwoPointLine();
void MXICSetClippingRectangle();
void MXICSubsequentFillTrapezoidSolid();
void MXICWriteImageTransferArea();
Bool MXICROPHasSrc();
Bool MXICROPHasDst();



/* Acceleration init function, sets up pointers to our accelerated functions */
/*
 * The following function sets up the supported acceleration. Call it from
 * the FbInit() function in the SVGA driver. Do NOT initialize any hardware
 * in here. 
 */

#ifdef DEBUGTRACE
/* Trace-only name tables, indexed in MXICAccelInit() by mxicPriv.CopScrWidth
 * and mxicPriv.CopPixFmt respectively.
 * NOTE(review): assumes the COPSCRWIDTH_* / COPPIXFMT_* constants are small
 * indices in exactly this order -- confirm against mxic_driver.h. */
char *ScrWidthStr[] = {"640", "800", "1024", "1152", "1280", "1600", "2048"};
char *PixelDepthStr[] = {"8BPP", "16BPP", "15BPP", "15BPP0", "32BPP", "24BPP"};    
#endif

/* Screen-format word (width bits | pixel-depth bits) assembled by
 * MXICAccelInit() and re-written with SET_SCR_FMT() by the setup routines. */
static unsigned scrfmt = 0;

void 
MXICAccelInit() 
{
#ifdef DEBUGTRACE
    ErrorF("MXICAccelInit() - ");
#endif

    /*
     * Rebuild the screen-format word from scratch.  scrfmt is a file-scope
     * static that is only ever OR-ed into below, so without this reset a
     * repeated call with different mode parameters would merge the width
     * and depth fields of both calls into one (invalid) value.
     */
    scrfmt = 0;

    /* Set-up our GE command primitive */

    /* Setup mode index needed by COP; unknown widths fall back to 640. */
    switch (vga256InfoRec.virtualX)
    {
    case 640:
        mxicPriv.CopScrWidth = COPSCRWIDTH_640;
        scrfmt |= COPSCRWIDTH640;
        break;
    case 800:
        mxicPriv.CopScrWidth = COPSCRWIDTH_800;
        scrfmt |= COPSCRWIDTH800;
        break;
    case 1024:
        mxicPriv.CopScrWidth = COPSCRWIDTH_1024;
        scrfmt |= COPSCRWIDTH1024;
        break;
    case 1152:
        mxicPriv.CopScrWidth = COPSCRWIDTH_1152;
        scrfmt |= COPSCRWIDTH1152;
        break;
    case 1280:
        mxicPriv.CopScrWidth = COPSCRWIDTH_1280;
        scrfmt |= COPSCRWIDTH1280;
        break;
    case 1600:
        mxicPriv.CopScrWidth = COPSCRWIDTH_1600;
        scrfmt |= COPSCRWIDTH1600;
        break;
    case 2048:
        mxicPriv.CopScrWidth = COPSCRWIDTH_2048;
        scrfmt |= COPSCRWIDTH2048;
        break;
    default:
        mxicPriv.CopScrWidth = COPSCRWIDTH_640;
        scrfmt |= COPSCRWIDTH640;
        break;
    }

    /*
     * Pixel depth.  There is deliberately no default here: for any other
     * bitsPerPixel value mxicPriv.CopPixFmt is left untouched and no depth
     * bits are added to scrfmt.
     */
    switch (vga256InfoRec.bitsPerPixel)
    {
    case 8:
        mxicPriv.CopPixFmt = COPPIXFMT_8BPP;
        scrfmt |= COPPIXEL8BPP;
        break;
    case 16:
        mxicPriv.CopPixFmt = COPPIXFMT_16BPP;
        scrfmt |= COPPIXEL16BPP;
        break;
    case 15:
        mxicPriv.CopPixFmt = COPPIXFMT_15BPP0;
        scrfmt |= COPPIXEL15BPP0;
        break;
    case 32:
        mxicPriv.CopPixFmt = COPPIXFMT_32BPP;
        scrfmt |= COPPIXEL32BPP;
        break;
    case 24:
        /* Not supported by this code path; should not normally get here. */
        mxicPriv.CopPixFmt = COPPIXFMT_24BPP;
        scrfmt |= COPPIXEL24BPP;
        break;
    }

    /*
     * This write is suspicious on the MX8625x: it seems that after
     * switching from standard VGA mode to extended mode the setting is
     * corrupted, which is why the setup routines write scrfmt again.
     */
    SET_SCR_FMT(scrfmt);

#ifdef DEBUGTRACE
    ErrorF("%s, %s, %x\n", ScrWidthStr[mxicPriv.CopScrWidth], PixelDepthStr[mxicPriv.CopPixFmt],scrfmt);
#endif

    mxicAccelCmd = 0;

    /* General acceleration flags (PIXMAP_CACHE is intentionally left out) */

    xf86AccelInfoRec.Flags = 
         /* PIXMAP_CACHE | */
         BACKGROUND_OPERATIONS |
         COP_FRAMEBUFFER_CONCURRENCY | 
         NO_SYNC_AFTER_CPU_COLOR_EXPAND |
         HARDWARE_PATTERN_MONO_TRANSPARENCY |
         HARDWARE_PATTERN_BIT_ORDER_MSBFIRST |  
         HARDWARE_PATTERN_PROGRAMMED_BITS |
         HARDWARE_PATTERN_SCREEN_ORIGIN;

    xf86AccelInfoRec.Sync = MXICAccelSync;

    /* ScreenToScreen copies */

    xf86AccelInfoRec.SetupForScreenToScreenCopy =
        MXICSetupForScreenToScreenCopy;
    xf86AccelInfoRec.SubsequentScreenToScreenCopy =
        MXICSubsequentScreenToScreenCopy;
    xf86GCInfoRec.CopyAreaFlags = NO_TRANSPARENCY; 

    /* Filled rectangles */

    xf86AccelInfoRec.SetupForFillRectSolid = 
        MXICSetupForFillRectSolid;
    xf86AccelInfoRec.SubsequentFillRectSolid = 
        MXICSubsequentFillRectSolid;
    xf86GCInfoRec.PolyFillRectSolidFlags = 0;  

    xf86AccelInfoRec.ColorExpandFlags = SCANLINE_PAD_DWORD |
                                        CPU_TRANSFER_PAD_DWORD | 
                                        VIDEO_SOURCE_GRANULARITY_PIXEL |
                                        BIT_ORDER_IN_BYTE_MSBFIRST |
                                        LEFT_EDGE_CLIPPING;

    /* CPU-to-screen color expansion: currently disabled. */
/*
    xf86AccelInfoRec.SetupForCPUToScreenColorExpand =
             MXICSetupForCPUToScreenColorExpand;
    xf86AccelInfoRec.SubsequentCPUToScreenColorExpand =
             MXICSubsequentCPUToScreenColorExpand;
    xf86AccelInfoRec.CPUToScreenColorExpandBase = 
             (void *) &IMG_TRANS;
    xf86AccelInfoRec.CPUToScreenColorExpandRange = 32768;
*/

    /* 8x8 pattern fills using color expansion: currently disabled. */
/*
    xf86AccelInfoRec.SetupFor8x8PatternColorExpand = 
            MXICSetupFor8x8PatternColorExpand;
    xf86AccelInfoRec.Subsequent8x8PatternColorExpand = 
            MXICSubsequent8x8PatternColorExpand;  
*/

    /* 8x8 color pattern fills: currently disabled. */
/*
    xf86AccelInfoRec.SetupForFill8x8Pattern = 
            MXICSetupForFill8x8Pattern;
    xf86AccelInfoRec.SubsequentFill8x8Pattern = 
            MXICSubsequentFill8x8Pattern; 
*/

    /* Accelerated lines/trapezoids: only semi-functional, so disabled. */
/*    xf86AccelInfoRec.SubsequentTwoPointLine = 
            MXICSubsequentTwoPointLine;
    xf86AccelInfoRec.SetClippingRectangle = 
            MXICSetClippingRectangle;
    xf86AccelInfoRec.SubsequentFillTrapezoidSolid = 
            MXICSubsequentFillTrapezoidSolid;   */

    /*
     * Finally, we set up the video memory space available to the pixmap
     * cache. In this case, all memory from the end of the virtual screen
     * to the end of video memory minus 1K, can be used. If you haven't
     * enabled the PIXMAP_CACHE flag, then these lines can be omitted.
     */

    xf86InitPixmapCache(&vga256InfoRec, vga256InfoRec.virtualY *
        vga256InfoRec.displayWidth * vga256InfoRec.bitsPerPixel / 8,
        vga256InfoRec.videoRam * 1024 -1024);

    /* And these are screen parameters used to setup the GE */

    mxicPriv.Width = vga256InfoRec.displayWidth;
    mxicPriv.Bpp = vgaBitsPerPixel / 8;
    mxicPriv.Bpl = mxicPriv.Width * mxicPriv.Bpp;
    /* Bottom scissor: number of whole scanlines in video memory (minus
     * the last 1K), clamped to 2047. */
    mxicPriv.ScissB = (vga256InfoRec.videoRam * 1024 - 1024) / mxicPriv.Bpl;
    if (mxicPriv.ScissB > 2047)
        mxicPriv.ScissB = 2047;

} 

void 
MXICAccelInit24() 
{
#ifdef DEBUGTRACE
    ErrorF("MXICAccelInit24()\n");
#endif

    /*
     * Only the screen geometry used by the GE is recorded here; no
     * accelerated functions are registered by this routine.
     */
    mxicPriv.Width = vga256InfoRec.displayWidth;
    mxicPriv.Bpp   = vgaBitsPerPixel / 8;
    mxicPriv.Bpl   = mxicPriv.Width * mxicPriv.Bpp;

    /* Bottom scissor line: scanlines that fit in video memory less 1K,
     * limited to 2047. */
    mxicPriv.ScissB = (vga256InfoRec.videoRam * 1024 - 1024) / mxicPriv.Bpl;
    if (2047 < mxicPriv.ScissB) {
        mxicPriv.ScissB = 2047;
    }
}

/* The sync function for the GE */
/*
 * This is the implementation of the Sync() function.
 *
 * To avoid pipeline/cache/buffer flushing in the PCI subsystem and the VGA
 * controller, we might replace this read-intensive code with a dummy
 * accelerator operation that causes a hardware-blocking (wait-states) until
 * the running operation is done.
 */
void
MXICAccelSync()
{
#ifdef DEBUGTRACE
    ErrorF("MXICAccelSync()\n");
#endif

    /* Drain pending framebuffer writes, then wait for the GE to go idle
     * with an empty FIFO (see the macro notes in the file header). */
    WaitCommandEmpty(); 
    WaitIdleEmpty(); 
    /* Restore the full-screen clip rectangle: (0,0) to (Width, ScissB). */
    CACHE_SETB_CLIP_L_T(0, 0);
    CACHE_SETB_CLIP_R_B(mxicPriv.Width, mxicPriv.ScissB);

/* Workaround for possible bug when pattern fills follow lines */
/* Disabled: this was the only consumer of mxicSyncForLineBug. */
/*
    if(mxicSyncForLineBug){
        CACHE_SETB_CMD_SET(mxicAccelCmd | CMD_BITBLT | ROP_D | CMD_AUTOEXEC);
        SETB_RSRC_XY(0,0);
        SETB_RDEST_XY(0,0);
        SETB_RWIDTH_HEIGHT(0,0);
        WaitIdleEmpty(); 
        mxicSyncForLineBug = FALSE;
        } 
*/
}


/* This next function performs a reset of the graphics engine and 
 * fills in some GE registers with default values.                  
 */

void
MXICGEReset()
{
    unsigned char tmp;

#ifdef DEBUGTRACE
    ErrorF("MXICGEReset()\n");
#endif
    /* Select the system control register, then write it back first with
     * bits 0x70 cleared and then with bits 0x70 set.
     * NOTE(review): presumably this pulses the engine reset bits -- confirm
     * against the MX8625x register documentation. */
    outb(vgaCRIndex, SystemControlRegIdx);
    tmp = inb(vgaCRReg);
    outb(vgaCRReg, tmp & ~0x70);
    outb(vgaCRReg, tmp | 0x70);
    /* Give the hardware 10 ms, then wait for idle with an empty FIFO. */
    usleep(10000);
    WaitIdleEmpty();

    /* Source and destination stride are both one screen line (Bpl bytes);
     * both base addresses are 0. */
    SETB_DEST_SRC_STR(mxicPriv.Bpl, mxicPriv.Bpl); 
    SETB_SRC_BASE(0);
    SETB_DEST_BASE(0);   

    /* Now write some default registers and reset cached values.  Each
     * cached copy is first set to a value different from the one written
     * next (presumably so the CACHE_SETB_* macro cannot skip the write). */
    mxicCached_CLIP_TL = -1;
    mxicCached_CLIP_BR = -1;
    CACHE_SETB_CLIP_L_T(0, 0);
    CACHE_SETB_CLIP_R_B(mxicPriv.Width, mxicPriv.ScissB);
    mxicCached_MONO_PATTERN0 = 0;
    mxicCached_MONO_PATTERN1 = 0;
    CACHE_SETB_MONO_PAT0(~0);
    CACHE_SETB_MONO_PAT1(~0);   

    /* The remaining cached registers are only invalidated, not written. */
    mxicCached_RWIDTH_HEIGHT = -1;
    mxicCached_PAT_FGCLR = -1;
    mxicCached_PAT_BGCLR = -1;
    mxicCached_CMD_SET = -1;

    /* Printed unconditionally (not under DEBUGTRACE), then zeroed. */
    ErrorF("MXIC register cache hits: %d misses: %d\n",mxicCacheHit, mxicCacheMiss);    
    mxicCacheHit = 0; mxicCacheMiss = 0;
}



/*
 * This is the implementation of the SetupForScreenToScreenCopy function
 * that sets up the coprocessor for a subsequent batch of
 * screen-to-screen copies. Remember, we don't handle transparency,
 * so the transparency color is ignored.
 */
/* These are the ScreenToScreen bitblt functions. We support all ROPs, all
 * directions, and a planemask by adjusting the ROP and using the mono pattern
 * registers. There is no support for transparency. 
 */

void 
MXICSetupForScreenToScreenCopy(xdir, ydir, rop, planemask,
transparency_color)
    int xdir, ydir;
    int rop;
    unsigned planemask;
    int transparency_color;
{
    /* Start from the driver-wide base bits plus a color source map. */
    int cmd = mxicAccelCmd | SOURCEMAP_COLOR;

#ifdef DEBUGTRACE
    ErrorF("MXICSetupForScreenToScreenCopy(%i,%i,%i,%x,%i)\n",xdir,ydir,rop,
            planemask,transparency_color);
#endif

    /* The screen format is re-written on every setup (see MXICAccelInit). */
    SET_SCR_FMT(scrfmt);

    /* Every copy is an auto-executing bitblt; only the ROP table (and the
     * mono-pattern mix bit) depends on whether a partial planemask is
     * requested. */
    cmd |= CMD_AUTOEXEC | CMD_BITBLT;
    if ((planemask & mxicPriv.PlaneMask) == mxicPriv.PlaneMask) {
        cmd |= mxicAlu[rop];
    } else {
        cmd |= mxicAlu_pat[rop] | MIX_MONO_PATT;
    }

    /* Direction bits: set for right-to-left / bottom-to-top copies. */
    if (xdir == -1) {
        cmd |= CMD_XP;
    }
    if (ydir == -1) {
        cmd |= CMD_YP;
    }

    /* Picked up later by MXICSubsequentScreenToScreenCopy(). */
    mxicSavedCmd = cmd;

    /*
     * The register writes that used to follow this wait (pattern
     * foreground colour, command set, mono patterns) are disabled in this
     * port; transparency_color is ignored.
     */
    WaitQueue(4);
}

/*
 * This is the implementation of the SubsequentForScreenToScreenCopy
 * that sends commands to the coprocessor to perform a screen-to-screen
 * copy of the specified areas, with the parameters from the SetUp call.
 * In this sample implementation, the direction must be taken into
 * account when calculating the addresses (with coordinates, it might be
 * a little easier).
 *
 */

void 
MXICSubsequentScreenToScreenCopy(x1, y1, x2, y2, w, h)
int x1, y1, x2, y2, w, h;
{
    int new_width = w;
    int src_x, src_y, dst_x, dst_y;

    if (!MXICROPHasDst(mxicSavedCmd)) {
        /* ROP does not read the destination: plain full-width clip. */
        WaitQueue(4);
        CACHE_SETB_CLIP_L_R(0, mxicPriv.Width); 
    } else {
        /* ROP reads the destination: let MXICCheckBltWidth() adjust the
         * width, and clip back to the requested span if it did. */
        new_width = MXICCheckBltWidth(w);  
        WaitQueue(5);
        if (new_width == w) {
            CACHE_SETB_CLIP_L_R(0, mxicPriv.Width); 
        } else {
            CACHE_SETB_CMD_SET(mxicSavedCmd | CMD_HWCLIP);
            CACHE_SETB_CLIP_L_R(x2, x2 + w -1); 
        }
    }

    /* When a direction bit is set the far edge is used as the start
     * coordinate on that axis. */
    if (mxicSavedCmd & CMD_XP) {
        src_x = x1 + new_width - 1;
        dst_x = x2 + new_width - 1;
    } else {
        src_x = x1;
        dst_x = x2;
    }
    if (mxicSavedCmd & CMD_YP) {
        src_y = y1 + h - 1;
        dst_y = y2 + h - 1;
    } else {
        src_y = y1;
        dst_y = y2;
    }

    SETB_RSRC_XY(src_x, src_y);
    SETB_RDEST_XY(dst_x, dst_y);
    SETB_RWIDTH_HEIGHT(new_width - 1, h - 1);

#ifdef DEBUGTRACE
    ErrorF("MXICSubsequentForScreenToScreenCopy(%x,%x,%x,%x,%x,%x)cmd=%x;src=(%x,%x),dst=(%x,%x),dim=(%x,%x)\n",
            x1, y1, x2, y2, w, h, mxicSavedCmd,
            src_x, src_y, dst_x, dst_y,
            new_width-1, h-1);
#endif

    /* NOTE(review): this final command write does not carry CMD_HWCLIP even
     * when the clip rectangle was narrowed above -- confirm that is what
     * the hardware needs. */
    SETB_CMD_SET(mxicSavedCmd); /* ??? */
    WaitIdleEmpty(); 
}


/*
 * This is the implementation of the SetupForFillRectSolid function
 * that sets up the coprocessor for a subsequent batch for solid
 * rectangle fills.
 */
/*
 * Even though the ViRGE supports the RECT operation directly, we use
 * BITBLTS with no source. Speed appears to be the same, 
 * and this avoids lockups which are experienced at 16bpp and 24bpp
 * with RECT when the ROP is not ROP_P.
 * We also support a planemask, by using a ROP which is adjusted "on-the-fly",
 * because I got tired of computing all these ROPs and putting them in tables.
 * The ROP that we need is computed as (ROP & S) | (D & ~S). S is the 
 * planemask which is mono-expanded. We get pretty large gains by doing this:
 * about 10 times faster than the non-accelerated routines for larger 
 * rectangles.
 */ 

void 
MXICSetupForFillRectSolid(color, rop, planemask)
int color, rop;
unsigned planemask;
{
    int cmd = mxicAccelCmd;

    /* The screen format is re-written on every setup (see MXICAccelInit). */
    SET_SCR_FMT(scrfmt);

    /* Solid fills are issued as auto-executing bitblts whose source is the
     * foreground colour register, mixed with the mono pattern. */
    cmd |= (CMD_AUTOEXEC | CMD_BITBLT | SOURCEMAP_FGCOLOR | MIX_MONO_PATT);

    if ((planemask & mxicPriv.PlaneMask) != mxicPriv.PlaneMask) {
        /* Partial planemask: the source foreground register carries the
         * planemask instead of the fill colour. */
        cmd |= mxicAlu_MonoTrans[rop];
        cmd |= /*MIX_CPUDATA |*/ MIX_MONO_SRC | CMD_ITA_DWORD;
        mxicSavedCmd2 = MXICROPHasSrc(cmd) ? NEED_MONO_FILL : NO_MONO_FILL;

        if (vgaBitsPerPixel != 8) {
            SETB_SRC_FG_CLR(planemask & mxicPriv.PlaneMask);
        } else {
            /* 8bpp: replicate the value into the next byte as well. */
            SETB_SRC_FG_CLR((planemask & mxicPriv.PlaneMask) |
                        ((planemask & mxicPriv.PlaneMask) << 8));
        }
    } else {
        /* Full planemask: plain source/pattern ROP with the fill colour. */
        cmd |= mxicAlu_sp[rop];
        mxicSavedCmd2 = NO_MONO_FILL;

        if (vgaBitsPerPixel != 8) {
            SETB_SRC_FG_CLR(color);
        } else {
            SETB_SRC_FG_CLR(color | (color << 8));
        }
    }

    /* Mono pattern of all ones. */
    MONO_PATTERN0dw[0] = 0xffffffff;
    MONO_PATTERN0dw[1] = 0xffffffff;

    /* Picked up later by MXICSubsequentFillRectSolid(). */
    mxicSavedCmd = cmd;

#ifdef DEBUGTRACE
    ErrorF("MXICSetupForFillRectSolid(%x,%x,%x)cmd=%x\n", color, rop, planemask, cmd);
#endif

    /*
     * The earlier ViRGE-style sequence (which also stored the command in
     * mxicSavedRectCmdForLine and programmed the cached command, mono
     * pattern and pattern foreground registers) is not used in this port.
     */
}
    
    
/*
 * This is the implementation of the SubsequentForFillRectSolid function
 * that sends commands to the coprocessor to fill a solid rectangle of
 * the specified location and size, with the parameters from the SetUp
 * call.
 *
 */
void 
MXICSubsequentFillRectSolid(x, y, w, h)
int x, y, w, h;
{
    int dwords_to_transfer, new_width;

    if (mxicSavedCmd2 == NEED_MONO_FILL) {
        /* Planemask case selected by MXICSetupForFillRectSolid(). */
#ifdef DEBUGTRACE
        ErrorF("MXICSubsequentForFillRectSolid(%x,%x,%x,%x):NEEDMONOFILL\n", x,y,w,h);
#endif
        new_width = MXICCheckLSPN(w, 1);  
        WaitQueue(4);
        if (new_width == w) {
            CACHE_SETB_CLIP_L_R(0, mxicPriv.Width); 
        } else {
            /* NOTE(review): mxicSavedRectCmdForLine is never assigned in
             * the code shown in this file, so this writes 0 | CMD_HWCLIP;
             * the final command below also lacks CMD_HWCLIP.  Confirm the
             * intended command word. */
            CACHE_SETB_CMD_SET(mxicSavedRectCmdForLine | CMD_HWCLIP);
            CACHE_SETB_CLIP_L_R(x, x + w -1); 
            w = new_width;
        }
        /* Size of the mono transfer; unused while the transfer-area write
         * stays disabled. */
        dwords_to_transfer = ((w + 31) / 32) * h;
        SETB_RDEST_XY(x, y);
        SETB_RWIDTH_HEIGHT(w - 1, h - 1);
        SETB_CMD_SET(mxicSavedCmd); 
/*        MXICWriteImageTransferArea (dwords_to_transfer, 0xffffffff);   */
    } else {
        /* Ordinary fill: destination, size, then the saved command. */
#ifdef DEBUGTRACE
        ErrorF("MXICSubsequentForFillRectSolid(%x,%x,%x,%x)\n", x,y,w,h);
#endif

        WaitQueue(2);
        SETB_RDEST_XY(x, y);
        SETB_RWIDTH_HEIGHT(w - 1, h - 1);
        SETB_CMD_SET(mxicSavedCmd); 
    }

    /* Each fill is waited for before returning. */
    WaitIdleEmpty(); 
}


/* These are the CPUToScreen color expand functions. We support a planemask
 * and transparency. The planemask is supported by making use of the 
 * mono pattern registers and using a ROP which is adjusted. Transparency
 * is handled through the ViRGE color expansion bitblt. 
 * Unfortunately, we have to use a hack, when the ROP does not contain
 * the source (ex. GX_CLEAR), because in that case, writing to the 
 * image transfer area will hang the chip, and XAA will write to the 
 * transfer area regardless of ROP. 
 */

void
MXICSetupForCPUToScreenColorExpand(bg, fg, rop, planemask)
int bg, fg, rop;
unsigned planemask;
{
    int cmd = mxicAccelCmd;
    int partial_mask = ((planemask & mxicPriv.PlaneMask) != mxicPriv.PlaneMask);
    int transparent = (bg == -1);

    /* ROP table depends on whether a partial planemask is in effect. */
    if (partial_mask) {
        cmd |= mxicAlu_pat[rop];
    } else {
        cmd |= mxicAlu[rop];
    }

    cmd |= (CMD_AUTOEXEC | CMD_BITBLT | MIX_MONO_PATT |
               MIX_CPUDATA | MIX_MONO_SRC |  
               CMD_ITA_DWORD | CMD_HWCLIP);

    if (transparent) {
       /* bg == -1 requests a transparent background. */
       cmd |= MIX_MONO_TRANSP;
       SETB_CLRCMP_MASK(0xffffffff);
       SETB_SRC_BG_CLR(0);
       SETB_CLRCMP_CLR(0);
    }

    if (!MXICROPHasSrc(cmd)) {
       /* Fix for XAA bug: XAA writes the transfer data regardless of ROP,
        * so point it at a dummy location when the ROP has no source. */
       xf86AccelInfoRec.CPUToScreenColorExpandBase = 
             (void *) &mxicDummyTransferArea;
       xf86AccelInfoRec.ColorExpandFlags |= CPU_TRANSFER_BASE_FIXED; 
    } else {
       xf86AccelInfoRec.CPUToScreenColorExpandBase = 
             (void *) &IMG_TRANS;
       xf86AccelInfoRec.ColorExpandFlags &= (~CPU_TRANSFER_BASE_FIXED); 
    }

    /* Foreground always; background only in the opaque case.  At 8bpp the
     * colour is replicated into the next byte. */
    WaitQueue(3);
    if (vgaBitsPerPixel != 8) {
        SETB_SRC_FG_CLR(fg);
        if (!transparent) {
            SETB_SRC_BG_CLR(bg);
        }
    } else {
        SETB_SRC_FG_CLR(fg | (fg << 8));
        if (!transparent) {
            SETB_SRC_BG_CLR(bg | (bg << 8));
        }
    }

    if (partial_mask) {
        /* The planemask goes through the mono pattern registers. */
        WaitQueue(4);
        CACHE_SETB_MONO_PAT0(~0);
        CACHE_SETB_MONO_PAT1(~0);   
        CACHE_SETB_PAT_FG_CLR(planemask & mxicPriv.PlaneMask);
    }

    /* The cached command write is disabled in this port; the command is
     * issued by MXICSubsequentCPUToScreenColorExpand() instead. */
    SETB_RSRC_XY(0, 0);   
    mxicSavedCmd = cmd;

#ifdef DEBUGTRACE
    ErrorF("MXICSetupCPUToScreenColorExpand(%x,%x,%x,%x)cmd=%x\n", bg,fg,rop,planemask,cmd);
#endif
}

/*
 * MXICSubsequentCPUToScreenColorExpand() is potentially dangerous:
 *   Not writing enough data to the image transfer area for CPU-to-screen
 *   color expansion will eventually cause a system deadlock!
 */

void
MXICSubsequentCPUToScreenColorExpand(x, y, w, h, skipleft)
int x, y, w, h, skipleft;
{
#ifdef DEBUGTRACE
    ErrorF("MXICSubsequentCPUToScreenColorExpand(%x,%x,%x,%x,%i)\n", x,y,w,h,skipleft);
#endif

    WaitQueue(3);

    /* Left-edge clipping: with a non-zero skipleft the left clip edge is
     * moved to x + skipleft, otherwise the full width is used. */
    if (skipleft == 0) {
        CACHE_SETB_CLIP_L_R(0, mxicPriv.Width); 
    } else {
        CACHE_SETB_CLIP_L_R(x + skipleft, mxicPriv.Width); 
    }

    /* Destination, size, then the command saved by the setup routine. */
    SETB_RDEST_XY(x, y);   
    SETB_RWIDTH_HEIGHT(w - 1, h - 1);
    SETB_CMD_SET(mxicSavedCmd); 
}


/* These functions provide 8x8 mono pattern fills. 
 *
 * Looking at the databook, one would think that you cannot do pattern fills, 
 * but we use the same trick as above with the rectangles, use the BIT_BLT
 * operation with the pattern and src/dst rectangles the same. We also 
 * support a planemask by using an adjusted ROP and doing a fill with 
 * CPU mono source expanded to the planemask (as for the filled rects). 
 *
 * The other thing we would like to do is support transparency. This is 
 * going to be a big win, especially for larger fills. We do the following 
 * for transparent pattern fills: we use the fact that the virge can do any
 * ROP between the S, P and D, and set ourselves up for a cpu-to-screen mono
 * transfer using an adjusted ROP. We transfer the fg color, which gets finally 
 * blitted to screen if the pattern bit is 1. Although this seems like a lot of 
 * overhead, x11perf shows that 1x1 stipples are a little faster,  
 * and 10x10 through 500x500 are 3 times faster.
 * As a bonus, we can also handle the planemask! 
 */ 


void
MXICSetupFor8x8PatternColorExpand(patternx, patterny, bg, fg, rop, planemask)
unsigned patternx, patterny;
int bg, fg, rop;
unsigned planemask;
{
    int i;
    int cmd = mxicAccelCmd;

    cmd |= ( CMD_AUTOEXEC | CMD_BITBLT | MIX_MONO_PATT | CMD_XP | CMD_YP );

    if (bg == -1) {
        /* Transparent pattern: always goes through the mono CPU-source
         * path with the adjusted ROP. */
        cmd |= mxicAlu_MonoTrans[rop];
        cmd |= ( MIX_CPUDATA | CMD_ITA_DWORD | CMD_HWCLIP | MIX_MONO_SRC );
        mxicSavedCmd2 = MXICROPHasSrc(cmd) ? NEED_MONO_FILL : 0;
    } else if ((planemask & mxicPriv.PlaneMask) == mxicPriv.PlaneMask) {
        /* Opaque pattern, full planemask: plain source/pattern ROP. */
        cmd |= mxicAlu_sp[rop];
        mxicSavedCmd2 = NO_MONO_FILL;
    } else {
        /* Opaque pattern with a partial planemask: the planemask is loaded
         * as the source foreground colour. */
        cmd |= mxicAlu_MonoTrans[rop];
        cmd |= ( MIX_CPUDATA | CMD_ITA_DWORD | CMD_HWCLIP | MIX_MONO_SRC );
        mxicSavedCmd2 = MXICROPHasSrc(cmd) ? NEED_MONO_FILL : NO_MONO_FILL;

        WaitQueue(1);
        if (vgaBitsPerPixel != 8) {
            SETB_SRC_FG_CLR(planemask & mxicPriv.PlaneMask);
        } else {
            SETB_SRC_FG_CLR((planemask & mxicPriv.PlaneMask) |
                        ((planemask & mxicPriv.PlaneMask) << 8));
        }
    }

    /* Program the two pattern words and the command through the cache. */
    WaitQueue(6);
    CACHE_SETB_MONO_PAT0(patternx);
    CACHE_SETB_MONO_PAT1(patterny);
    CACHE_SETB_CMD_SET(cmd);

    if (bg == -1) {
        /* Transparent: pattern colours carry planemask/0, and the real
         * foreground goes into the source foreground register. */
        CACHE_SETB_PAT_FG_CLR(planemask & mxicPriv.PlaneMask);
        CACHE_SETB_PAT_BG_CLR(0);
        if (vgaBitsPerPixel != 8) {
            SETB_SRC_FG_CLR(fg);
        } else {
            SETB_SRC_FG_CLR(fg | (fg << 8));
        }
    } else {
        CACHE_SETB_PAT_FG_CLR(fg);
        CACHE_SETB_PAT_BG_CLR(bg);
    }

}


void
MXICSubsequent8x8PatternColorExpand(patternx, patterny, x, y, w, h)
unsigned patternx, patterny;
int x, y, w, h;
{
    int dwords_to_transfer;
    int new_width;

    if (mxicSavedCmd2 == NEED_MONO_FILL) {
        /* Transparent case or planemask: a mono source of all ones is fed
         * through the image transfer area. */
        new_width = MXICCheckLSPN(w, 1);  /* Check for blit bugs */
        dwords_to_transfer = h * ((new_width + 31) / 32) ;

        WaitQueue(3);
        if (new_width == w) {
             CACHE_SETB_CLIP_L_R(0, mxicPriv.Width);
        } else {
             /* Width was changed: clip to the requested span. */
             CACHE_SETB_CLIP_L_R(x, x + w -1); 
        }
        SETB_RDEST_XY(x, y);
        SETB_RWIDTH_HEIGHT(new_width - 1, h - 1);
        MXICWriteImageTransferArea (dwords_to_transfer, 0xffffffff);
    } else {
        /* Opaque case, no planemask: destination and size only. */
        WaitQueue(3);
        SETB_RDEST_XY(x, y);
        SETB_RWIDTH_HEIGHT(w - 1, h - 1);
    }
}


/* These functions implement fills using color 8x8 patterns.
 * The patterns are stored in video memory, but the virge wants
 * them programmed into its pattern registers, so we transfer them from
 * the frame buffer to the GE through the CPU (ouf!). We support a planemask,
 * but not transparency. I guess we could do it, but it would get very messy
 * and is only very rarely used. 
 */

void 
MXICSetupForFill8x8Pattern(patternx, patterny, rop, planemask, trans_col)
unsigned patternx, patterny;
int rop; 
unsigned planemask;
int trans_col;
{
    int *pattern_addr, *color_regs;
    int num_bytes, i;
    int cmd = mxicAccelCmd;

    /* Locate the 8x8 pattern in the frame buffer and the colour pattern
     * registers it has to be copied into. */
    color_regs = (int *) &COLOR_PATTERN0;
    pattern_addr = (int *) (xf86AccelInfoRec.FramebufferBase + 
                           patternx * vgaBitsPerPixel / 8 +
                           patterny * mxicPriv.Bpl);

    /* 64 pixels' worth of bytes, copied from the frame buffer to the GE. */
    num_bytes = 64 * vgaBitsPerPixel / 8;
    BusToMem(color_regs, pattern_addr, num_bytes);   

    if ((planemask & mxicPriv.PlaneMask) != mxicPriv.PlaneMask) {
        /* Partial planemask: adjusted ROP with a mono CPU source expanded
         * to the planemask. */
        cmd |= mxicAlu_MonoTrans[rop];
        cmd |= ( CMD_ITA_DWORD | MIX_MONO_SRC | MIX_CPUDATA | CMD_HWCLIP );
        mxicSavedCmd2 = MXICROPHasSrc(cmd) ? NEED_MONO_FILL : NO_MONO_FILL;

        WaitQueue(1);
        if (vgaBitsPerPixel != 8) {
            SETB_SRC_FG_CLR(planemask & mxicPriv.PlaneMask);
        } else {
            SETB_SRC_FG_CLR((planemask & mxicPriv.PlaneMask) |
                        ((planemask & mxicPriv.PlaneMask) << 8));
        }
    } else {
        cmd |= mxicAlu_sp[rop];
        mxicSavedCmd2 = NO_MONO_FILL;
    }

    /* Common bits are added only after the ROP-source check above. */
    cmd |= (CMD_AUTOEXEC | CMD_BITBLT | MIX_COLOR_PATT | CMD_XP | CMD_YP); 

    /* trans_col is not used: transparency is not supported here. */
    WaitQueue(1);
    CACHE_SETB_CMD_SET(cmd);
}




void MXICSubsequentFill8x8Pattern(patternx, patterny, x, y, w, h)
unsigned patternx, patterny;
int x, y, w, h;
{
    int dwords_to_transfer;
    int new_width;

    if (mxicSavedCmd2 == NEED_MONO_FILL) {
        /* We have a planemask: feed an all-ones mono source through the
         * image transfer area. */
        new_width = MXICCheckLSPN(w, 1);  /* Check for blit bugs */
        dwords_to_transfer = h * ((new_width + 31) / 32) ;

        WaitQueue(3);
        if (new_width == w) {
             CACHE_SETB_CLIP_L_R(0, mxicPriv.Width); 
        } else {
             /* Width was changed: clip to the requested span. */
             CACHE_SETB_CLIP_L_R(x, x + w -1); 
        }
        SETB_RDEST_XY(x, y);
        SETB_RWIDTH_HEIGHT(new_width - 1, h - 1);
        MXICWriteImageTransferArea (dwords_to_transfer, 0xffffffff);
    } else {
        /* No planemask: destination and size only. */
        WaitQueue(2);
        SETB_RDEST_XY(x, y);
        SETB_RWIDTH_HEIGHT(w - 1, h - 1);
    }
}



/*
 * Draw a line from (x1,y1) to (x2,y2) using the GE line primitive.
 *
 * x1, y1, x2, y2 - end points in screen coordinates
 * bias           - only reported in the debug trace here; the code that
 *                  used it to program the line end points is disabled.
 *
 * X coordinates are handled in fixed point with 20 fractional bits.  The
 * slope, Y count and direction are still computed, but the register writes
 * that would consume them (SETL_LDX, SETL_LYCNT, SETL_LXEND0_END1) are
 * disabled in this port, so the routine is only semi-functional.
 */
void MXICSubsequentTwoPointLine(x1, y1, x2, y2, bias)
int x1, x2, y1, y2, bias;
{
    int cmd = mxicAccelCmd;
    int dx, dy;
    int Xdelta, Xstart, Ystart, Ycount;
    int fix_one = 1 << 20;      /* 1.0 in 20-bit fixed point */
    Bool Dir = 0;

    dx = x2 - x1;
    dy = y2 - y1;
    WaitQueue(1); 

    /* Per-call trace: only under DEBUGTRACE, like every other routine in
     * this file, so the log is not flooded on every line drawn. */
#ifdef DEBUGTRACE
    ErrorF("TwoPointLine, x1 %d y1 %d x2 %d y2 %d bias %d\n",
        x1, y1, x2, y2, bias);
#endif

    /* Slope in fixed point.  Multiplication is used instead of "<< 20"
     * because left-shifting a negative int is undefined behaviour. */
    if (dy != 0)
        Xdelta = -(dx * fix_one) / dy;
    else
        Xdelta = 0;

    if (y1 > y2) {   /* Things are the right way for ViRGE */
        if (dx > dy) { 
            if (dx > 0)
                Xstart = x1 * fix_one + dx / 2; 
            else
                Xstart = x1 * fix_one + dx / 2 + (fix_one - 1);
        }
        else {
            Xstart = x1 * fix_one;
        }
        Ystart = y1;
        Ycount = abs(y2 - y1) + 1;
        /* Disabled end-point programming:
        if (dx > 0) 
            SETL_LXEND0_END1(x1, (bias & 0x100) ? (x2 - 1) : x2); 
        else 
            SETL_LXEND0_END1(x1, (bias & 0x100) ? (x2 + 1) : x2); 
        */
        if (dx >= 0) Dir = 1;
    }
    else {   /* Things are reversed for ViRGE */
        if (dx > dy) { 
            if (dx > 0)
                Xstart = x2 * fix_one + dx / 2; 
            else
                Xstart = x2 * fix_one + dx / 2 + (fix_one - 1);
        }
        else {
            Xstart = x2 * fix_one;
        }
        Ystart = y2;
        Ycount = abs(y1 - y2) + 1;
        /* Disabled end-point programming:
        if (dx > 0)
            SETL_LXEND0_END1(x2, (bias & 0x100) ? (x1 - 1) : x1); 
        else 
            SETL_LXEND0_END1(x2, (bias & 0x100) ? (x1 + 1) : x1); 
        */
        if (dx <= 0) Dir = 1;
    }

    /* Only referenced by the disabled SETL_LDX / SETL_LYCNT writes below. */
    (void) Xdelta;
    (void) Ycount;
    (void) Dir;

    /* Line command; the 8 bits starting at bit 17 are taken over from the
     * saved rectangle command. */
    cmd |= (CMD_LINE | CMD_AUTOEXEC | MIX_MONO_PATT) ;
    cmd |= (mxicSavedRectCmdForLine & (0xff << 17));
    if (mxicLineHWClipSet) cmd |= CMD_HWCLIP ;

    WaitQueue(5);
    SETL_CMD_SET(cmd);
    /* SETL_LDX(Xdelta); */
    SETL_LXSTART(Xstart);
    SETL_LYSTART(Ystart);
    /* SETL_LYCNT(Ycount | (Dir ? 0x80000000 : 0x00));  */

    /* A clip rectangle set by MXICSetClippingRectangle() applies to one
     * line only; restore the full-screen clip afterwards. */
    if (mxicLineHWClipSet) {
        WaitQueue(2);
        CACHE_SETB_CLIP_L_T(0, 0); 
        CACHE_SETB_CLIP_R_B(mxicPriv.Width, mxicPriv.ScissB); 
        mxicLineHWClipSet = FALSE;
    }
  
    /* Put the rectangle command back and flag the line workaround. */
    WaitQueue(1);  
    CACHE_SETB_CMD_SET(mxicSavedRectCmdForLine); 
    mxicSyncForLineBug = TRUE;
 
}


void MXICSetClippingRectangle(x1, y1, x2, y2)
int x1, y1, x2, y2;
{
    /* NOTE(review): x2/y2 are added to x1/y1, i.e. treated as extents
     * rather than as the opposite corner -- confirm against the XAA
     * SetClippingRectangle contract. */
    int right = x1 + x2;
    int bottom = y1 + y2;

    WaitQueue(2);
    CACHE_SETB_CLIP_L_T(x1, y1); 
    CACHE_SETB_CLIP_R_B(right, bottom);   

    /* Tells MXICSubsequentTwoPointLine() to enable hardware clipping and
     * to restore the full-screen clip afterwards. */
    mxicLineHWClipSet = TRUE;
}


/* This next function is used to write a dword to the image transfer 
 * area. This is used for pattern fills, rectangle fills etc. We
 * also have the option of using DMA to do this.
 */

void
MXICWriteImageTransferArea (dwords, value)
int dwords;
unsigned value;
{
    unsigned int *dst;
    int remaining = dwords;
    int n;

    /* Full passes: the write pointer is rewound to the start of the image
     * transfer area after every 8192 dwords. */
    while (remaining >= 8192) {
        dst = (unsigned int *) &IMG_TRANS;
        for (n = 8192; n > 0; n--) {
            *dst++ = value;
            write_mem_barrier();
        }
        remaining -= 8192;
    }

    /* Final partial pass (nothing is written when remaining <= 0). */
    dst = (unsigned int *) &IMG_TRANS;
    for (n = remaining; n > 0; n--) {
        *dst++ = value;
        write_mem_barrier();
    }
}

